import re

import nltk
from nltk.tokenize import RegexpTokenizer

def split_questions_nltk(text):
    """Split exam text into individual numbered questions.

    A question starts at a line beginning with ``<number>.`` followed by
    whitespace, and runs up to the next such line, the next flush-left line
    (e.g. a section heading such as "二、..."), or the end of the text.
    Continuation lines of a question (answer options) are assumed to be
    indented, as in the sample exam below.

    Note: the previous NLTK-based pattern let section headings and trailing
    blank lines bleed into the end of the preceding question; this version
    stops before them and strips surrounding whitespace.  Plain ``re`` is
    used since NLTK's RegexpTokenizer was only a thin wrapper around it.

    Args:
        text: Full exam text containing questions numbered "1.", "2.", ...

    Returns:
        list[str]: One stripped string per question, in document order.
    """
    # ^\d+\.\s+             -> question number at the start of a line
    # .*?                   -> question body (DOTALL: spans indented option lines)
    # (?=^\d+\.\s|^\S|\Z)   -> stop before the next question, the next
    #                          flush-left heading, or end of input
    pattern = re.compile(r'^\d+\.\s+.*?(?=^\d+\.\s|^\S|\Z)',
                         re.MULTILINE | re.DOTALL)
    return [match.strip() for match in pattern.findall(text)]

# Sample exam text used to demonstrate the splitter.
exam_text = """
一、这道题
1. What is the capital of France?
   a) Berlin
   b) Madrid
   c) Paris
   d) Rome

2. Which planet is known as the Red Planet?
   a) Earth
   b) Mars
   c) Jupiter
   d) Saturn
二、这道题
3. Who wrote 'Pride and Prejudice'?
   a) William Shakespeare
   b) Jane Austen
   c) Charles Dickens
   d) Mark Twain
"""

# Split the exam into questions and print each one, numbered from 1.
questions = split_questions_nltk(exam_text)
for number, question_text in enumerate(questions, start=1):
    print(f"Question {number}:\n{question_text}\n")